library(tidyverse)
library(corrplot)
library("PerformanceAnalytics")
library(gtsummary)
library(flextable)
library(caret)
library(glmnet)
library(e1071)
food_data <- read_csv("Food_Supply_kcal_Data.csv")
head(food_data)
## # A tibble: 6 x 32
## Country `Alcoholic Bever… `Animal Product… `Animal fats` `Aquatic Products…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanist… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Antigua a… 2.30 15.4 1.54 0
## 6 Argentina 1.44 15.0 1.06 0
## # … with 27 more variables: Cereals - Excluding Beer <dbl>, Eggs <dbl>,
## # Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>, Meat <dbl>,
## # Milk - Excluding Butter <dbl>, Miscellaneous <dbl>, Offals <dbl>,
## # Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>,
## # Stimulants <dbl>, Sugar Crops <dbl>, Sugar & Sweeteners <dbl>,
## # Treenuts <dbl>, Vegetal Products <dbl>, Vegetable Oils <dbl>,
## # Vegetables <dbl>, Obesity <dbl>, Undernourished <chr>, Confirmed <dbl>,
## # Deaths <dbl>, Recovered <dbl>, Active <dbl>, Population <dbl>,
## # Unit (all except Population) <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data %>%
drop_na() %>%
mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
#creating a column that specifies whether each country falls above or below the median
mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")),
Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))
food_data <- food_data %>%
select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`,
-Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
## Country `Alcoholic Bevera… `Animal Product… `Animal fats` `Aquatic Products…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanis… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Argentina 1.44 15.0 1.06 0
## 6 Armenia 0.227 12.8 1.77 0
## # … with 22 more variables: Cereals - Excluding Beer <dbl>, Eggs <dbl>,
## # Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>, Meat <dbl>,
## # Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## # Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>, Sugar Crops <dbl>,
## # Sugar & Sweeteners <dbl>, Treenuts <dbl>, Vegetal Products <dbl>,
## # Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")
chart.Correlation(food_data[,2:24], histogram=TRUE)
# summary statistics
food_data_1 <- food_data %>%
select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.76 (1.15) | 0.86 (0.78) | <0.001 |
Animal Products | 154 | 11.5 (4.1) | 6.6 (4.2) | <0.001 |
Animal fats | 154 | 1.77 (1.50) | 0.76 (0.85) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.52 (0.26) | 0.32 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.61 (0.60) | 0.57 (0.50) | 0.7 |
Fruits - Excluding Wine | 154 | 2.21 (1.46) | 1.80 (1.43) | 0.082 |
Meat | 154 | 4.44 (1.68) | 2.93 (2.25) | <0.001 |
Milk - Excluding Butter | 154 | 3.99 (1.84) | 1.89 (1.72) | <0.001 |
Offals | 154 | 0.15 (0.09) | 0.14 (0.13) | 0.6 |
Oilcrops | 154 | 0.66 (0.83) | 1.43 (1.85) | 0.001 |
Pulses | 154 | 0.79 (0.73) | 1.48 (1.52) | <0.001 |
Spices | 154 | 0.18 (0.24) | 0.19 (0.25) | >0.9 |
Starchy Roots | 154 | 2.2 (2.4) | 4.3 (5.0) | 0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.16) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.005 |
Sugar & Sweeteners | 154 | 5.62 (1.63) | 3.92 (2.22) | <0.001 |
Treenuts | 154 | 0.32 (0.29) | 0.22 (0.29) | 0.035 |
Vegetal Products | 154 | 38.5 (4.1) | 43.4 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.09 (2.17) | 4.66 (2.19) | 0.2 |
Vegetables | 154 | 1.20 (0.63) | 0.93 (0.63) | 0.008 |
Confirmed_per_capita | 154 | 161 (282) | 1 (2) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.05) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
food_data_2 <- food_data %>%
select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.75 (1.15) | 0.88 (0.80) | <0.001 |
Animal Products | 154 | 11.4 (4.2) | 6.7 (4.2) | <0.001 |
Animal fats | 154 | 1.78 (1.50) | 0.75 (0.83) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.53 (0.26) | 0.31 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.60 (0.60) | 0.58 (0.50) | 0.8 |
Fruits - Excluding Wine | 154 | 2.08 (1.37) | 1.94 (1.54) | 0.5 |
Meat | 154 | 4.38 (1.74) | 2.99 (2.24) | <0.001 |
Milk - Excluding Butter | 154 | 3.98 (1.84) | 1.90 (1.73) | <0.001 |
Offals | 154 | 0.14 (0.08) | 0.14 (0.14) | >0.9 |
Oilcrops | 154 | 0.68 (0.94) | 1.42 (1.81) | 0.002 |
Pulses | 154 | 0.78 (0.68) | 1.49 (1.54) | <0.001 |
Spices | 154 | 0.17 (0.23) | 0.20 (0.25) | 0.5 |
Starchy Roots | 154 | 2.0 (2.3) | 4.5 (5.0) | <0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.17) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.004 |
Sugar & Sweeteners | 154 | 5.67 (1.65) | 3.86 (2.16) | <0.001 |
Treenuts | 154 | 0.31 (0.28) | 0.22 (0.30) | 0.045 |
Vegetal Products | 154 | 38.6 (4.2) | 43.3 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.24 (2.20) | 4.52 (2.12) | 0.041 |
Vegetables | 154 | 1.19 (0.64) | 0.94 (0.62) | 0.013 |
Confirmed_per_capita | 154 | 157 (283) | 5 (27) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.04) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
# prepare data for cross validation
food_data <- food_data %>%
# delete country since only each country has only one observation
select(-`Animal Products`, -Country,-`Vegetal Products`)
food_data %>% head()
## # A tibble: 6 x 24
## `Alcoholic Bevera… `Animal fats` `Aquatic Products,… `Cereals - Exclud… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>,
## # Meat <dbl>, Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>,
## # Sugar Crops <dbl>, Sugar & Sweeteners <dbl>, Treenuts <dbl>,
## # Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
library(GGally)
# a panel of scatterplots using `GGally`
ggpairs(data = food_data[,c(1:20,23)],
ggplot2::aes(colour = Cases_vs_median, alpha =0.5))
# numeric
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Confirmed_per_capita, k=10)
result_lm <- list()
result_knn <- list()
result_lasso <- list()
result_ridge <- list()
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
lm_fit = lm(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10)
knn_fit = train(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10,
method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
trControl = trainControl(method="cv", number = 5))
lasso_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,21]), alpha=1)
ridge_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,21]), alpha=0)
#linear
food_data_test_cv_10$Confirmed_lm_predict <- predict(lm_fit, newdata = food_data_test_cv_10)
result_lm[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lm_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lm_predict)
#knn
food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
result_knn[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_knn_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_knn_predict)
#lasso
food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:20)])) %*% coef(lasso_fit, s = "lambda.min")
result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lasso_predict)
#ridge
food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:20)])) %*% coef(ridge_fit, s = "lambda.min")
result_ridge[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_ridge_predict)
}
data.frame_lm = data.frame("CV_error" = mean(unlist(result_lm)), "CV_error_SE" = sd(unlist(result_lm)))
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
rbind("Linear Regression" = data.frame_lm, "KNN" = data.frame_knn, "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
## CV_error CV_error_SE
## Linear Regression 64728.59 91521.07
## KNN 37346.76 44964.51
## Lasso 30592.48 32348.18
## Ridge 41715.88 45982.01
food_data <- food_data %>%
mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))
# categorical -> predict cases
library(rBayesianOptimization)
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
result_qda <- list()
result_lasso <- list()
result_ridge <- list()
food_data %>% head()
## # A tibble: 6 x 24
## `Alcoholic Bevera… `Animal fats` `Aquatic Products,… `Cereals - Exclud… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>,
## # Meat <dbl>, Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>,
## # Sugar Crops <dbl>, Sugar & Sweeteners <dbl>, Treenuts <dbl>,
## # Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
food_data$Cases_vs_median <- as_factor(food_data$Cases_vs_median)
food_data$Deaths_vs_median <- as_factor(food_data$Deaths_vs_median)
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
# knn
set.seed(2342)
knn_fit <- train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10,
method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
trControl = trainControl(method="cv", number = 5))
# logistic
logistic_fit <- glm(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, family = binomial())
# SVM - linear
set.seed(2342)
svr_linear_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="linear", ranges=list(cost=1:3, epsilon=c(0.1,0.5,1)))
linear_svm_fit <- svr_linear_tune$best.model
# SVM - radial
set.seed(2342)
svr_radial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="radial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, epsilon=c(0.1,0.5,1)))
radial_svm_fit <- svr_radial_tune$best.model
# SVM - polynomial
set.seed(2342)
svr_polynomial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="polynomial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, d=seq(2,3)))
polynomial_svm_fit <- svr_polynomial_tune$best.model
# QDA
frmla <-as.formula(paste0(names(food_data_train_cv_10)[23], "~", paste0("`", names(food_data_train_cv_10)[-c(3,21:24)], "`", collapse="+")))
qda_fit <- train(frmla, data = food_data_train_cv_10, method = "qda")
# lasso & ridge
lasso_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,23]), alpha=1, family="binomial")
ridge_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,23]), alpha=0, family="binomial")
# knn
food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Confirmed_knn_predict,
reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_knn_predict)
#logistic
food_data_test_cv_10$Confirmed_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
food_data_test_cv_10$Confirmed_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_logistic_predict_num >= 0.5, "1", "0"))
result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_logistic_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_logistic_predict, -Confirmed_logistic_predict_num)
# linear svm
food_data_test_cv_10$Confirmed_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_linear_svm_predict)
# radial svm
food_data_test_cv_10$Confirmed_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_radial_svm_predict)
# polynomial svm
food_data_test_cv_10$Confirmed_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_polynomial_svm_predict)
# qda
food_data_test_cv_10$Confirmed_qda_predict_num <- predict(qda_fit, newdata = food_data_test_cv_10, type = "prob", na.action=na.exclude)$`1`
food_data_test_cv_10$Confirmed_qda_predict = relevel(factor(ifelse(food_data_test_cv_10$Confirmed_qda_predict_num > 0.5, "1", "0")), ref = "0")
result_qda[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_qda_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_qda_predict)
# Lasso
# probability of being in class 1
food_data_test_cv_10$Confirmed_lasso_predict_num <- predict(lasso_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
food_data_test_cv_10$Confirmed_lasso_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_lasso_predict_num >= 0.5, "1", "0"))
result_lasso[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_lasso_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_lasso_predict, -Confirmed_lasso_predict_num)
# Ridge
food_data_test_cv_10$Confirmed_ridge_predict_num <- predict(ridge_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
food_data_test_cv_10$Confirmed_ridge_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_ridge_predict_num >= 0.5, "1", "0"))
result_ridge[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_ridge_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Confirmed_ridge_predict, -Confirmed_ridge_predict_num)
}
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))
data.frame_qda = data.frame("CV_error" = mean(unlist(result_qda)), "CV_error_SE" = sd(unlist(result_qda)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
summary_table <-
rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm,
"Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm, "QDA" = data.frame_qda,
"Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
Algorithm Name | CV_error | CV_error_SE |
KNN | 0.24 | 0.083 |
Logistic Regression | 0.21 | 0.116 |
Linear SVM | 0.24 | 0.142 |
Radial SVM | 0.25 | 0.102 |
Polynomial SVM | 0.25 | 0.135 |
QDA | 0.26 | 0.122 |
Lasso | 0.20 | 0.110 |
Ridge | 0.23 | 0.111 |
###### lasso best model
lasso_best <- cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,23]), alpha=1)
coef(lasso_best, s = "lambda.min")
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.01933817
## Alcoholic Beverages 0.04015531
## Animal fats .
## Aquatic Products, Other .
## Cereals - Excluding Beer -0.00113986
## Eggs .
## Fish, Seafood .
## Fruits - Excluding Wine 0.01518515
## Meat .
## Milk - Excluding Butter 0.04491741
## Offals .
## Oilcrops .
## Pulses .
## Spices .
## Starchy Roots .
## Stimulants 0.30721984
## Sugar Crops -0.09814204
## Sugar & Sweeteners 0.04135097
## Treenuts .
## Vegetable Oils .
## Vegetables .
library(randomForest)
# random forest
names(food_data) <- make.names(names(food_data))
food_data %>% head()
## # A tibble: 6 x 24
## Alcoholic.Beverag… Animal.fats Aquatic.Products..… Cereals...Excluding… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: Fish..Seafood <dbl>, Fruits...Excluding.Wine <dbl>,
## # Meat <dbl>, Milk...Excluding.Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, Starchy.Roots <dbl>, Stimulants <dbl>,
## # Sugar.Crops <dbl>, Sugar...Sweeteners <dbl>, Treenuts <dbl>,
## # Vegetable.Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result <- list()
result_rf <- list()
n_list <- c(50, 250, 500)
p <- ncol(food_data) - 4
c(p/2, sqrt(p), p)
## [1] 10.000000 4.472136 20.000000
m_list <- c(10, 4, 20)
reg_rf_tune <- list()
reg_rf_oob_errors <- list()
round = 0
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
for (i in 1:length(n_list)) {
for (j in 1:length(m_list)) {
round <- round + 1
set.seed(7812)
reg_rf_tune[[round]] <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median,
data = food_data_train_cv_10, ntree=n_list[i], mtry=m_list[j])
head(food_data_train_cv_10)
reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j],
"oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
}
}
reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
set.seed(9984)
reg_rf <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10,
ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
Confirmed_Predict_Prob <- predict(reg_rf, newdata = food_data_test_cv_10, type = "prob")
food_data_test_cv_10$Confirmed_rf_Predict <- colnames(Confirmed_Predict_Prob)[max.col(Confirmed_Predict_Prob, ties.method="first")]
per_class_accuracy <- rep(NA,length(levels(food_data_test_cv_10$Cases_vs_median)))
for (i in 1:length(per_class_accuracy)){
per_class_accuracy[i] = food_data_test_cv_10 %>%
filter(Cases_vs_median == levels(Cases_vs_median)[i]) %>%
summarise(accuracy = sum(Confirmed_rf_Predict ==
levels(Cases_vs_median)[i]) / n()) %>%
unlist()
names(per_class_accuracy)[i] <- paste0("accuracy_",
levels(food_data_test_cv_10$Cases_vs_median)[i])
}
result[[f]] = per_class_accuracy
result_rf[[f]] <- 1- confusionMatrix(data = as.factor(food_data_test_cv_10$Confirmed_rf_Predict),
reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
}
df = as.data.frame(matrix(unlist(result), ncol = 2, byrow = TRUE))
CV_error = c("Below" = (1-mean(df[,1], na.rm=TRUE)),
"Above" = (1-mean(df[,2], na.rm=TRUE)))
CV_error_SE = c("Below" = sd((1-df[,1]), na.rm=TRUE),
"Above" = sd((1-df[,2]), na.rm=TRUE))
data.frame(CV_error, CV_error_SE)
## CV_error CV_error_SE
## Below 0.2142857 0.2032544
## Above 0.1785714 0.1491662
rf <- data.frame("CV Error" = CV_error, "CV Error SE" = CV_error_SE)
flextable(rf %>% rownames_to_column("Class Name"))
Class Name | CV.Error | CV.Error.SE |
Below | 0.21 | 0.20 |
Above | 0.18 | 0.15 |
data.frame_rf <- data.frame("CV_error" = mean(unlist(result_rf)), "CV_error_SE" = sd(unlist(result_rf)))
summary_table <- rbind(summary_table, "Random Forest" = data.frame_rf)
summary_table
## CV_error CV_error_SE
## KNN 0.2350595 0.08279182
## Logistic Regression 0.2051786 0.11602243
## Linear SVM 0.2386310 0.14153751
## Radial SVM 0.2484524 0.10199887
## Polynomial SVM 0.2502381 0.13471454
## QDA 0.2573214 0.12212815
## Lasso 0.2016071 0.10989594
## Ridge 0.2279762 0.11053066
## Random Forest 0.1976190 0.09852584
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
Algorithm Name | CV_error | CV_error_SE |
KNN | 0.24 | 0.083 |
Logistic Regression | 0.21 | 0.116 |
Linear SVM | 0.24 | 0.142 |
Radial SVM | 0.25 | 0.102 |
Polynomial SVM | 0.25 | 0.135 |
QDA | 0.26 | 0.122 |
Lasso | 0.20 | 0.110 |
Ridge | 0.23 | 0.111 |
Random Forest | 0.20 | 0.099 |
# best random forest - top 10 predictors
reg_rf_tune <- list()
reg_rf_oob_errors <- list()
round = 0
for (i in 1:length(n_list)) {
for (j in 1:length(m_list)) {
round <- round + 1
set.seed(7812)
reg_rf_tune[[round]] <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median,
data = food_data, ntree=n_list[i], mtry=m_list[j])
reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j],
"oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
}
}
reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
set.seed(9984)
reg_rf <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data,
ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
varImpPlot(reg_rf, sort = TRUE, n.var = 10, main = "Top 10 Important Variables Predicting Cases vs Median")
varImpPlot(reg_rf, sort = TRUE)